# # Maximum Number of Threads
import os
os.environ['NUMEXPR_MAX_THREADS'] = '24'
import numpy as np
import pandas as pd
# PyStan
from pystan import StanModel
# preprocessing
from sklearn import preprocessing
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Global matplotlib styling shared by every figure in this notebook.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Render matplotlib figures inline (IPython magic — notebook only).
%matplotlib inline
## arviz
import arviz as az
import warnings
# Silence all library warnings for cleaner notebook output.
warnings.filterwarnings("ignore")
clear_output()
In this article, we analyze and develop models using Bayesian Methods and PyStan for the Mercari Price Suggestion Challenge.
Mercari, Japan’s biggest community-powered shopping app, knows this problem deeply. They’d like to offer pricing suggestions to sellers, but this is tough because their sellers are enabled to put just about anything, or any bundle of things, on Mercari's marketplace.
The files consist of a list of product listings. These files are tab-delimited.
| Columns | Description |
|---|---|
| train_id or test_id | the id of the listing |
| name | the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage. These removed prices are represented as (rm) |
| item_condition_id | the condition of the items provided by the seller |
| category_name | category of the listing |
| brand_name | |
| price | the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict. |
| shipping | 1 if shipping fee is paid by seller and 0 by buyer |
| item_description | the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage. These removed prices are represented as (rm) |
We will model price for all categories. Our estimate of the parameter of product price can be considered a prediction. This article is inspired by A Primer on Bayesian Methods for Multilevel Modeling.
# Load the Mercari training set (tab-separated, per the data description above).
Data = pd.read_csv('mercari-price-suggestion-challenge/train.tsv', sep = '\t')
# Title-case the column names, keeping 'ID' fully capitalised (e.g. 'Train_ID').
Data.columns = [x.title().replace('Id','ID') for x in Data.columns]
# Rename the 0/1 'Shipping' flag to the more descriptive 'Shipping Fee'.
Data.rename(columns = {'Shipping':'Shipping Fee'}, inplace = True)
def Data_info(Inp, Only_NaN = False):
    """Summarise the columns of *Inp*: dtype, NaN count, row count and NaN percentage.

    Parameters
    ----------
    Inp : pd.DataFrame
        Data to summarise.
    Only_NaN : bool, optional
        When True, keep only the columns that contain at least one NaN.

    Returns
    -------
    pd.DataFrame
        One row per column of *Inp*, sorted by dtype.
    """
    n_rows = Inp.shape[0]
    summary = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_counts = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    summary = summary.join(nan_counts, how='outer')
    summary['Size'] = n_rows
    summary['Percentage'] = np.round(100*(summary['Number of NaN Values']/n_rows),2)
    # Optionally restrict the report to columns with missing values.
    return summary.loc[summary['Number of NaN Values']>0] if Only_NaN else summary
def Data_Plot(Inp, Title, W = None):
    """Bar-plot, per column of *Inp*, the percentage of non-missing values.

    Parameters
    ----------
    Inp : pd.DataFrame
        Data to summarise.
    Title : str
        Figure title (rendered bold).
    W : int, optional
        Figure width in pixels; plotly's default width is used when None.

    Returns
    -------
    pd.DataFrame
        The per-column summary (dtype, NaN count, size, valid-value percentage).
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of *valid* (non-NaN) entries per column.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
                 text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    # Identity comparison against None (was the non-idiomatic `not W == None`).
    if W is not None:
        fig.update_layout(width = W)
    fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
# Drop listings without a category — the hierarchical models group by it.
Data.dropna(subset = ['Category_Name'], inplace = True)
# a random sample of the dataset (1%, fixed seed) to keep Stan runtimes manageable
Data = Data.sample(frac=0.01, random_state=99)
_ = Data_Plot(Data, Title = 'Mercari Price Suggestion', W = 700)
# Work on an explicit copy so the recoding below cannot write back into Data
# through a view (pandas chained-assignment / SettingWithCopy hazard).
Temp = Data[['Price', 'Shipping Fee']].copy()
# Recode the 0/1 shipping flag per the data dictionary: 0 -> fee paid by the
# buyer, 1 -> paid by the seller. (The original mapped 0 to 'Seller', which
# contradicts both the data description and the axis labels used later.)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Buyer' if x == 0 else 'Seller')
C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
                   color_discrete_sequence = C,
                   title = 'Probability Density (Price)')
fig['layout']['xaxis'].update(range=[0, 100])
fig['layout']['yaxis'].update(range=[0, .1])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()
# Integer-encode category names (0..M-1, alphabetical) for use as Stan indices.
le = preprocessing.LabelEncoder()
Data['Category'] = le.fit_transform(Data['Category_Name'])
# Log-transform price; the +0.1 offset keeps zero-priced listings finite.
Data['Price_Log'] = Data['Price'].apply(lambda x: np.log(x+0.1))
# Lookup table mapping each encoded category id back to its name, ordered by id.
Category_df = Data.loc[Data.index.isin(Data['Category'].drop_duplicates().index),
                       ['Category_Name','Category']].sort_values(by=['Category']).reset_index(drop = True)
# Explicit copy so the recoding cannot write back into Data through a view.
Temp = Data[['Price_Log', 'Shipping Fee']].copy()
# 0 -> fee paid by the buyer, 1 -> by the seller (see the data dictionary;
# the original mapping was inverted).
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Buyer' if x == 0 else 'Seller')
fig = px.histogram(Temp, x = 'Price_Log', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
                   color_discrete_sequence = C,
                   title = 'Probability Density (Price (Logarithm))')
fig['layout']['xaxis'].update(range=[-4, 8])
fig['layout']['yaxis'].update(range=[0, 3])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()
del Temp
In this section, we would like to do a panel data analysis in which data are often collected over time and the same individuals. Then a regression is run over these two dimensions [1].
A common panel data regression model: $$y_{it}= \alpha + \beta x_{it}+\varepsilon _{it},$$ where
For more details, please see [1, 2].
Useful functions throughout the article:
def df_search(df, key):
    """Return the column labels of *df* that contain the substring *key*."""
    matches = []
    for col in df.columns:
        if key in col:
            matches.append(col)
    return matches
def list_search(List, key):
    """Return the elements of *List* that contain the substring *key*."""
    return list(filter(lambda item: key in item, List))
def fit_vars(fit):
    """Return the sorted parameter names of a Stan fit, minus bookkeeping variables.

    Drops the sampler log-density 'lp__' and the fitted-values vector 'y_hat'
    so that only model parameters remain (e.g. for trace/posterior plots).
    The original used bare `except:` clauses around `list.remove`, which would
    also have hidden unrelated errors; filtering is both safer and simpler.

    Parameters
    ----------
    fit : StanFit-like
        Any object with an ``extract(permuted=True)`` method returning a dict.

    Returns
    -------
    list of str
        Sorted variable names.
    """
    excluded = ('lp__', 'y_hat')
    return sorted(v for v in fit.extract(permuted=True).keys() if v not in excluded)
# Map Stan variable names to LaTeX strings used in figure titles and legends.
LaTex = {'alpha': r'$\alpha$', 'beta': r'$\beta$', 'sigma': r'$\sigma$',
         'sigma_alpha': r'$\sigma_{\alpha}$', 'sigma_beta': r'$\sigma_{\beta}$', 'sigma_y': r'$\sigma_{y}$',
         'mu_alpha': r'$\mu_{\alpha}$', 'mu_beta': r'$\mu_{\beta}$', 'y_hat': r'$\hat{y}$'}
In complete pooling, we combine all the information from all the categories into a single pool of data. Thus, we treat all categories the same and estimate a single price level. $$y_{i}= \alpha + \beta x_{i}+\varepsilon _{i},$$
However, a problem with this approach is the level might be different for different categories.
The model can be defined with the following considerations: \begin{align} \begin{cases} \alpha~\sim~N\left(0,\sigma^2\right),\\ \beta~\sim~N\left(0,\sigma^2\right),\\ \end{cases} \end{align}
where $\sigma$ has a half-Cauchy distribution. A half-Cauchy is one of the symmetric halves of the Cauchy distribution (if unspecified, the right half is intended).
Cauchy Probability Density Function: $$\text{Cauchy}(y|\mu,\sigma) = \frac{1}{\pi \sigma} \ \frac{1}{1 + \left((y - \mu)/\sigma\right)^2}.$$
Moreover, let
| Parameter | Description |
|---|---|
| x | Shipping Fee |
| y | Price (Log) |
# Complete-pooling model: a single intercept and slope shared by all categories.
model_code = """data {int<lower=0> N; vector[N] x; vector[N] y;}
parameters {real alpha; real beta; real<lower=0> sigma;}
model {y ~ normal(alpha + beta * x, sigma);}"""
IT = int(1e3)
# Pooled model
model = StanModel(model_code = model_code)
CP_fit = model.sampling(data = {'N': Data.shape[0], 'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values}, iter= IT, chains=2)
Extract_Samples = CP_fit.extract(permuted=True)
# Posterior draws of the pooled parameters (one row per sample).
CP_Parm = pd.DataFrame({'alpha': Extract_Samples['alpha'], 'beta': Extract_Samples['beta']})
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_411b73098686f98516e3786c03c23d3c NOW.
# Figures
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
# Left panel: posterior draws of alpha with their mean as a horizontal line.
_ = ax[0].scatter(x= CP_Parm.index, y= CP_Parm.alpha, color = 'SkyBlue', edgecolors = 'Navy', label = LaTex['alpha'])
_ = ax[0].hlines(CP_Parm.alpha.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = LaTex['mu_alpha'])
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[0].set_ylim([3.06, 3.16])
# Right panel: posterior draws of beta with their mean.
_ = ax[1].scatter(x= CP_Parm.index, y= CP_Parm.beta, color = 'LimeGreen', edgecolors = 'DarkGreen', label= LaTex['beta'])
_ = ax[1].hlines(CP_Parm.beta.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
                 color = 'Navy', lw = 3, label = LaTex['mu_beta'])
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[1].set_ylim([-.48, -.36])
# Trace/density plots of the pooled parameters, retitled with LaTeX symbols.
Vars = ['alpha', 'beta', 'sigma']
ax = az.plot_trace(CP_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
# Each variable owns two axes (trace + density), hence each title is doubled.
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Report the posterior means (coloured console output via colorama).
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'alpha = %.4f, beta = %.4f' % (CP_Parm.mean().values[0],
                                                                             CP_Parm.mean().values[1]))
t = np.linspace(-0.2, 1.2)
# Posterior-mean regression line of the pooled model.
# BUGFIX: the original lambda was `lambda x: ... * t` — it ignored its own
# parameter and closed over the global `t` (which is deleted below), so later
# calls only worked by accident. Bind the parameter instead.
CP_fun = lambda x: CP_Parm.alpha.mean() + CP_Parm.beta.mean() * x
fig = go.Figure()
fig.add_trace(go.Scatter(x= Data['Shipping Fee'].values, y = Data['Price_Log'].values, mode='markers',
                         name='Shipping Fee - Price (Log)'))
fig.add_trace(go.Scatter(x= t, y= CP_fun(t), mode='lines', name='Fitted Model'))
fig.update_layout(title = 'Fitted Model', plot_bgcolor= 'WhiteSmoke',
                  xaxis_title= 'Shipping Fee', yaxis_title= 'Price (Log)', height = 500, width = 650)
fig['layout']['yaxis'].update(range=[-4, 8])
# Show the 0/1 shipping flag as who pays the fee (0 = buyer, 1 = seller).
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [0,1], ticktext = ['Buyer', 'Seller']))
fig.show()
del t
alpha = 3.1049, beta = -0.4191
Note that this is similar to a Linear Regression model (Check this article for more details regarding Linear Regression model).
# Frequentist cross-check: OLS of Price_Log on Shipping_Fee should match the
# pooled Bayesian estimates of alpha (intercept) and beta (slope).
# NOTE(review): `smf` (statsmodels.formula.api) is used here but never
# imported in the visible preamble — confirm the import exists elsewhere.
Temp = Data[['Price_Log', 'Shipping Fee']]
# Patsy formulas cannot contain spaces in column names.
Temp.columns = ['Price_Log', 'Shipping_Fee']
Results = smf.ols('Price_Log ~ Shipping_Fee', Temp).fit()
Results.summary().tables[1]
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 3.1050 | 0.009 | 359.542 | 0.000 | 3.088 | 3.122 |
| Shipping_Fee | -0.4196 | 0.013 | -32.717 | 0.000 | -0.445 | -0.394 |
Here, Intercept is the same as $\alpha$ and the coefficient of Shipping Fee is the same as $\beta$.
We assume that there is no connection at all between the Price (Log) levels in the different categories. In other words, we model price in each category independently. That is
$$y_{i}= \alpha_{j[i]} + \beta x_{i}+\epsilon_i,$$The model can be defined with the following considerations: \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]}, \beta \sim N(\mu, \sigma^2) \end{align}
# No-pooling model: an independent intercept alpha[j] per category, shared slope.
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] x; vector[N] y;}
parameters {vector[M] alpha; real beta; real<lower=0,upper=100> sigma;}
transformed parameters {vector[N] y_hat;
for (i in 1:N)
y_hat[i] = alpha[category[i]] + beta * x[i];}
model {y ~ normal(y_hat, sigma);}"""
IT = int(1e3)
model = StanModel(model_code = model_code)
# Stan arrays are 1-indexed, hence `Category + 1`.
NP_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = NP_fit.extract(permuted=True)
# Un-pooled (no-pool) parameters: one alpha column per category, plus beta.
NP_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
NP_Parm['beta'] = Extract_Samples['beta']
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_8aa7bda1b995aaf0ac1bc89a13ee18d6 NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# Posterior draws of the shared slope beta with their mean.
fig, ax = plt.subplots(1, 1, figsize=(16, 6.5))
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
_ = ax.scatter(x= Temp.index, y= Temp['beta'], color = 'SkyBlue', edgecolors = 'Navy')
# BUGFIX: the mean line of beta was labelled with LaTex['mu_alpha']; it is
# the mean of beta. Also add the legend so the label is actually displayed
# (consistent with the complete-pooling figure above).
_ = ax.hlines(Temp.mean(), -1, len(Temp.index)+1, linestyles='-',
              color = 'OrangeRed', lw = 3, label = LaTex['mu_beta'])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax.set_title(r'$\beta$', fontsize = 16)
_ = ax.grid(True)
_ = ax.set_xlim([-1, len(NP_Parm.index)+1])
_ = ax.set_ylim([-.36, -.24])
del Temp
# Trace plots for all (non-bookkeeping) no-pool parameters.
Vars = fit_vars(NP_fit)
ax = az.plot_trace(NP_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
# Two axes (trace + density) per variable, hence each title is doubled.
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Posterior densities on a 2-column grid sized to fit all variables.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(NP_fit, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Hide any unused axes left over in the grid.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
A plot of the ordered Estimates:
# Top: posterior mean +/- one s.d. of each category intercept, ordered by mean.
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 8])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom: full posterior distributions for the first n categories.
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-1, 8])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
# One fitted line per category: alpha_j + beta * x, evaluated at x in {0, 1}.
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = NP_Parm.copy()
beta = Temp['beta'].mean()
Temp = Temp[df_search(Temp, 'alpha')]
for alpha in Temp.mean(axis=0):
    ax.plot(t, alpha + beta*t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
# x = 0: buyer pays the shipping fee; x = 1: seller pays.
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha, beta, delta
A Visual comparisons between the pooled and unpooled estimates for a subset of categories:
# Compare the pooled and unpooled fits for the 12 lowest-intercept categories.
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
s = 4
fig, ax = plt.subplots(4, 3, figsize=(14, 14))
ax = ax.ravel()
lines = []
labels = []
for i, c in enumerate(Temp.Categories[:12]):
    y = Data.loc[Data.Category_Name==c, 'Price_Log'].values
    x = Data.loc[Data.Category_Name==c, 'Shipping Fee'].values
    # Per-category (no-pool) line: this category's intercept + the shared slope.
    NP_fun = lambda t: Temp.loc[Temp.Categories == c, 'mean'].values[0] + t*NP_Parm['beta'].mean()
    # Jitter x slightly so overlapping 0/1 points remain visible.
    ax[i].scatter(x + np.random.randn(len(x))*0.01, y, alpha=0.4)
    t = np.linspace(-0.2, 1.2)
    # BUGFIX: the legend labels were swapped — NP_fun is the no-pooling fit
    # and CP_fun the complete-pooling fit.
    ax[i].plot(t, NP_fun(t), 'k', lw = 2, label = 'No Pooling')
    ax[i].plot(t, CP_fun(t), 'r', lw = 2, label = 'Complete Pooling')
    ax[i].set_xticks([0,1])
    ax[i].set_title(c)
    ax[i].set_ylim([0, 8])
    ax[i].set_xlabel('Shipping Fee')
    ax[i].set_ylabel('Price (Log)')
    axLine, axLabel = ax[i].get_legend_handles_labels()
    lines.extend(axLine)
    labels.extend(axLabel)
    del axLine, axLabel
# One shared legend (first two handles) above the panel grid.
fig.legend(lines[:2], labels[:2], bbox_to_anchor=(0.2, 1), loc='lower left',
           ncol=2, title = 'Fitted Model', mode="expand", borderaxespad=0 , fontsize = 14)
plt.tight_layout()
There are a few approaches that we discuss here. For example,
The simplest possible partial pooling model for the retail price dataset is one that simply estimates prices, $\alpha$, with no other predictors and ignoring the effect of shipping, $\beta$.
In doing so, let, $$y_i = \alpha_{j[i]} + \epsilon_i$$
where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}
# Partial pooling (intercept-only): category intercepts shrink toward mu_alpha;
# no slope term, so shipping is ignored in this model.
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] y;}
parameters {vector[M] alpha; real mu_alpha;
real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;}
transformed parameters {vector[N] y_hat;
for (i in 1:N)
y_hat[i] = alpha[category[i]];}
model {mu_alpha ~ normal(0, 1); alpha ~ normal (mu_alpha, sigma_alpha); y ~ normal(y_hat, sigma_y);}"""
IT = int(1e3)
model = StanModel(model_code = model_code)
# Stan arrays are 1-indexed, hence `Category + 1`.
PP_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = PP_fit.extract(permuted=True)
# Partial-pooling parameters: one alpha column per category.
PP_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_0727cfb6bf80e3019e25eccebf3d9ffa NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# Variables of the intercept-only partial-pooling model (sorted for stable order).
Vars = ['sigma_y', 'alpha', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(PP_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
# Two axes (trace + density) per variable, hence each title is doubled.
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Posterior densities on a 2-column grid sized to fit all variables.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(PP_fit, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Hide any unused axes left over in the grid.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
# Top: posterior mean +/- one s.d. of each category intercept, ordered by mean.
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 5])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom: full posterior distributions for the first n categories.
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
Standard Error $$SE = \frac{\sigma} {\sqrt{n}}$$
# Compare raw per-category sample means (left) with partial-pool posterior
# means (right), both plotted against category sample size n (log x-scale).
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
# Per-category sample size, mean and standard deviation of Price_Log.
Group = pd.DataFrame({'n': Data.groupby('Category_Name')['Train_ID'].count(),
                      'mean': Data.groupby('Category_Name')['Price_Log'].mean(),
                      'sd': Data.groupby('Category_Name')['Price_Log'].std()})
# Standard error: SE = sd / sqrt(n).
Group['se'] = Group['sd']/np.sqrt(Group['n'])
# Horizontal jitter so categories with equal n do not overlap.
jitter = np.random.normal(scale=0.1, size= Temp.shape[1])
fig, ax = plt.subplots(1, 2, figsize=(16,8))
_ = ax[0].scatter(Group['n'] + jitter, Group['mean'], color = 'SkyBlue', edgecolors = 'Navy')
for j, row in zip(jitter, Group.iterrows()):
    # (The original duplicated the assignment target: `n, mean, se = n, mean, se = ...`.)
    n, mean, se = row[1]['n'], row[1]['mean'], row[1]['se']
    _ = ax[0].plot([n + j, n + j], [mean - se, mean + se], 'k-', alpha = 0.5)
_ = ax[0].set_xscale('log')
_ = ax[0].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[0].set_ylim([0,6])
_ = ax[0].set_xlim([0,1e3])
_ = ax[0].set_title('No Pool Model Estimates Mean')
_ = ax[1].scatter(Group['n'] + jitter, Temp.mean(axis=0), color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[1].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[1].set_xscale('log')
_ = ax[1].set_xlim([0,1e3])
_ = ax[1].set_ylim([0,6])
_ = ax[1].set_title('Partial Pool Model Estimates Mean')
# Right panel error bars: posterior mean +/- posterior s.d. per category.
for j, n , mean , std in zip(jitter, Group['n'].values, Temp.mean(axis=0), Temp.std(axis=0)):
    _ = ax[1].plot([n+j, n+j], [mean- std, mean + std], 'k-', alpha = 0.5)
del j, n, mean, std, Temp, Group
The unpooled estimates are more imprecise than the partial pool estimates.
This model allows intercepts to vary across the category, according to a random effect.
$$y_i = \alpha_{j[i]} + \beta x_{i} + \epsilon_i$$where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}
# Varying-intercept model: alpha_j ~ N(mu_alpha, sigma_alpha), shared slope beta.
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] x; vector[N] y;}
parameters {vector[M] alpha; real beta; real mu_alpha;
real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;}
transformed parameters {
vector[N] y_hat;
for (i in 1:N)
y_hat[i] = alpha[category[i]] + x[i]*beta;}
model {sigma_alpha ~ uniform(0, 100); alpha ~ normal (mu_alpha, sigma_alpha);
beta ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}"""
IT = int(1e3)
model = StanModel(model_code = model_code)
# Stan arrays are 1-indexed, hence `Category + 1`.
VI_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = VI_fit.extract(permuted=True)
# Varying-intercept parameters: one alpha column per category, plus beta.
VI_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
# Read beta from the extracted samples, consistent with the other models
# (the original indexed the fit object directly: VI_fit['beta']).
VI_Parm['beta'] = Extract_Samples['beta']
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_b99073f13080d6f65bb96e4568e5d8f1 NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# Trace plots for all (non-bookkeeping) varying-intercept parameters.
Vars = fit_vars(VI_fit)
ax = az.plot_trace(VI_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
# Two axes (trace + density) per variable, hence each title is doubled.
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Posterior densities on a 2-column grid sized to fit all variables.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(VI_fit, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Hide any unused axes left over in the grid.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
# Top: posterior mean +/- one s.d. of each category intercept, ordered by mean.
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom: full posterior distributions for the first n categories.
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
# One fitted line per category: varying intercept, shared slope, x in {0, 1}.
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VI_Parm.copy()
for alpha in Temp[df_search(Temp, 'alpha')].mean(axis=0):
    ax.plot(t, alpha + Temp['beta'].mean()*t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
# x = 0: buyer pays the shipping fee; x = 1: seller pays.
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha
Let's pick a category. For example,
# Pick the first category whose name contains 'Styling Products' for prediction.
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
# Encoded (0-based) id of the selected category.
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
# Varying-intercept model extended with a posterior-predictive draw y_pred for
# one chosen category (`pred`, 0-based — hence `pred + 1` inside Stan) at the
# category's average shipping value x_pred.
model_code = """data {int<lower=0> N; int<lower=0> M;
int<lower=1,upper=M> category[N];
int<lower=0,upper=M> pred;
vector[N] x; vector[N] y;
real x_pred;}
parameters {vector[M] alpha; real beta; real mu_alpha;
real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;}
transformed parameters {vector[N] y_hat; real pred_mu;
for (i in 1:N)
y_hat[i] = alpha[category[i]] + x[i]*beta;
pred_mu = alpha[pred + 1] + x_pred * beta;}
model {sigma_alpha ~ uniform(0, 100); alpha ~ normal (mu_alpha, sigma_alpha);
beta ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}
generated quantities {real y_pred;
y_pred = normal_rng(pred_mu, sigma_y);}
"""
IT = int(1e3)
model = StanModel(model_code = model_code)
VI_pred = model.sampling(data = {'N': Data.shape[0],
                                 'M': len(Data['Category'].unique()),
                                 'category': Data['Category'].values + 1,
                                 'x': Data['Shipping Fee'].values,
                                 'y': Data['Price_Log'].values,
                                 'pred': Selected_Category,
                                 'x_pred': Data.loc[Data.Category == Selected_Category,'Shipping Fee'].values.mean()
                                 },
                         iter= IT, chains=2)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_2d2f1b23c36143764c9da57b5a5630f7 NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# Trace and posterior density of the predictive draws.
_ = az.plot_trace(VI_pred, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4))
_ = az.plot_posterior(VI_pred, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
Temp0 = VI_pred.extract(permuted=True)['y_pred']
# Left panel: predictive draws on the log scale with their mean.
_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
# Right panel: the same draws back-transformed to dollars via exp.
Temp0 = np.exp(Temp0)
_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
del Temp0
# Compare the posterior predictions with the observed prices of this category.
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Price | 28.0 | 17.857143 | 10.582005 | 4.000000 | 8.750000 | 17.500000 | 24.000000 | 40.000000 |
| Price_Log | 28.0 | 2.692309 | 0.674433 | 1.410987 | 2.179172 | 2.867495 | 3.182212 | 3.691376 |
An alternative would be
$$y_i = \alpha + \beta_{j[i]} x_{i} + \epsilon_i$$where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2)\\ \alpha \sim N(0, \sigma^2)\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2) \end{cases} \end{align}
This model highlights the relationship between the measured logarithm of the price, the prevailing price level, and the effect of who pays the shipping fee for each measurement.
# Varying-slope model: shared intercept alpha, per-category slope beta_j
# shrunk toward mu_beta. (The deprecated Stan assignment `<-` was replaced
# with `=`, consistent with the other models in this notebook.)
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] x; vector[N] y;}
parameters {real alpha; vector[M] beta; real mu_beta; real<lower=0,upper=100> sigma_beta;
real<lower=0,upper=100> sigma_y;}
transformed parameters {
vector[N] y_hat;
for (i in 1:N)
y_hat[i] = alpha + beta[category[i]] * x[i];}
model {sigma_beta ~ uniform(0, 100); beta ~ normal (mu_beta, sigma_beta);
alpha ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}"""
IT = int(1e3)
model = StanModel(model_code = model_code)
# Stan arrays are 1-indexed, hence `Category + 1`.
VS_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = VS_fit.extract(permuted=True)
# Varying-slope parameters: one beta column per category, plus alpha.
VS_Parm = pd.DataFrame(Extract_Samples['beta'], columns = ['beta%i' %i for i in range(len(Data['Category'].unique()))])
# Read alpha from the extracted samples, consistent with the other models
# (the original indexed the fit object directly: VS_fit['alpha']).
VS_Parm['alpha'] = Extract_Samples['alpha']
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_65e6829fce2dc08bd6b7cd5b2e245975 NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# Trace plots for all (non-bookkeeping) varying-slope parameters.
Vars = fit_vars(VS_fit)
Vars = sorted(Vars)
ax = az.plot_trace(VS_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
# Two axes (trace + density) per variable, hence each title is doubled.
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Posterior densities on a 2-column grid sized to fit all variables.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(VS_fit, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Hide any unused axes left over in the grid.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
# Posterior summary of the category slopes (the effect of who pays shipping).
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
# Aggregate with a list (not a set) so the row/column order is deterministic.
Temp = Temp.agg(['mean', 'std']).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top: posterior mean +/- one posterior std per category, ordered by mean.
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-2, 2])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom: full posterior distributions (boxplots) for the first n categories.
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-3, 3])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
# One fitted line per category: the shared posterior-mean intercept plus each
# category's posterior-mean slope, evaluated at x = 0 (buyer) and x = 1 (seller).
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)  # shipping fee paid by buyer (0) or seller (1)
Temp = VS_Parm.copy()
alpha_hat = Temp['alpha'].mean()
slope_means = Temp[df_search(Temp, 'beta')].mean(axis=0)
for slope in slope_means:
    ax.plot(t, alpha_hat + slope * t, color='DarkBlue', marker='o',
            linestyle='--', linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0] - delta, t[1] + delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
# Pick an example category by (partial) name, then look up its integer code.
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
# Same varying-slope model as VS_fit, extended with a posterior-predictive
# draw y_pred for one category (`pred`) at shipping-fee value `x_pred`.
# FIX: the priors were swapped relative to VS_fit (alpha was hierarchical and
# beta ~ normal(0, 1)); the prediction must come from the same model that was
# analyzed, i.e. hierarchical beta and alpha ~ normal(0, 1).
model_code = """data {int<lower=0> N; int<lower=0> M;
int<lower=1,upper=M> category[N];
int<lower=0,upper=M> pred;
vector[N] x; vector[N] y;
real x_pred;}
parameters {real alpha; vector[M] beta; real mu_beta;
real<lower=0,upper=100> sigma_beta; real<lower=0,upper=100> sigma_y;}
transformed parameters {vector[N] y_hat; real pred_mu;
for (i in 1:N)
  y_hat[i] = alpha + x[i]*beta[category[i]];
pred_mu = alpha + x_pred * beta[pred + 1];}
model {sigma_beta ~ uniform(0, 100); beta ~ normal (mu_beta, sigma_beta);
alpha ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}
generated quantities {real y_pred;
y_pred = normal_rng(pred_mu, sigma_y);}
"""
IT = int(1e3)  # iterations per chain
model = StanModel(model_code = model_code)
# `pred` is passed 0-based and shifted inside the Stan program (pred + 1);
# x_pred is the average shipping-fee indicator within the selected category.
VS_pred = model.sampling(data = {'N': Data.shape[0],
                                 'M': len(Data['Category'].unique()),
                                 'category': Data['Category'].values + 1,
                                 'x': Data['Shipping Fee'].values,
                                 'y': Data['Price_Log'].values,
                                 'pred': Selected_Category,
                                 'x_pred': Data.loc[Data.Category == Selected_Category,'Shipping Fee'].values.mean()
                                 },
                         iter= IT, chains=2)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_c440dcc8f99272d7ac526fced2cc6489 NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit) WARNING:pystan:34 of 1000 iterations ended with a divergence (3.4 %). WARNING:pystan:Try running with adapt_delta larger than 0.8 to remove the divergences.
# Posterior-predictive diagnostics for y_pred, then the sampled predictions as
# scatter plots: log scale on the left, back-transformed dollars on the right.
_ = az.plot_trace(VS_pred, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4))
_ = az.plot_posterior(VS_pred, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
_vs_category_label = Category_df.loc[Category_df.Category == Selected_Category,
                                     'Category_Name'].values[0]

def _vs_pred_panel(panel, draws, face, ylim, ylabel):
    # One panel: the draws as a scatter plus a horizontal line at their mean.
    n_draws = len(draws)
    _ = panel.scatter(x=np.arange(n_draws), y=draws, color=face, edgecolors='Navy')
    _ = panel.hlines(draws.mean(), -1, n_draws + 1, linestyles='-',
                     color='OrangeRed', lw=3, label='mean = %.2f' % draws.mean())
    _ = panel.set_title('Predictions: %s' % _vs_category_label, fontsize=16)
    _ = panel.grid(True)
    _ = panel.set_xlim([-1, n_draws + 1])
    _ = panel.set_ylim(ylim)
    _ = panel.set_ylabel(ylabel)
    _ = panel.legend(bbox_to_anchor=(1, 1), fontsize=14)

Temp0 = VS_pred['y_pred']
_vs_pred_panel(ax[0], Temp0, 'SkyBlue', [-1, 6], 'Estimated Price (Logarithm Scale)')
_vs_pred_panel(ax[1], np.exp(Temp0), 'MediumSeaGreen', [0, 160], 'Estimated Price')
del Temp0
# Compare against the empirical price distribution of the selected category.
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Price | 28.0 | 17.857143 | 10.582005 | 4.000000 | 8.750000 | 17.500000 | 24.000000 | 40.000000 |
| Price_Log | 28.0 | 2.692309 | 0.674433 | 1.410987 | 2.179172 | 2.867495 | 3.182212 | 3.691376 |
The most general model allows both the intercept and slope to vary by category:
$$y_i = \alpha_{j[i]} + \beta_{j[i]} x_{i} + \epsilon_i$$where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2),\\ \alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2),\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2). \end{cases} \end{align}
# Varying-intercept, varying-slope model: both alpha[j] and beta[j] are
# category-specific, each with its own normal population distribution.
# FIX: declare the category indices with <lower=1,upper=M> bounds, as the
# other models do, so out-of-range indices are rejected at data load time.
model_code = """data {int<lower=0> N; int<lower=0> M; vector[N] x; vector[N] y; int<lower=1,upper=M> category[N];}
parameters {vector[M] alpha; vector[M] beta;
real mu_alpha; real mu_beta;
real<lower=0> sigma; real<lower=0> sigma_alpha; real<lower=0> sigma_beta;}
model {mu_alpha ~ normal(0, 100); mu_beta ~ normal(0, 100);
alpha ~ normal(mu_alpha, sigma_alpha); beta ~ normal(mu_beta, sigma_beta);
y ~ normal(alpha[category] + beta[category].*x, sigma);}"""
IT = int(1e3)  # iterations per chain
model = StanModel(model_code = model_code)
# Stan arrays are 1-indexed, hence the +1 shift on the 0-based category codes.
VSI_fit = model.sampling(data = {'N': Data.shape[0],
                                 'M': len(Data['Category'].unique()),
                                 'category': Data['Category'].values + 1,
                                 'x': Data['Shipping Fee'].values,
                                 'y': Data['Price_Log'].values},
                         iter= IT, chains=2)
Extract_Samples = VSI_fit.extract(permuted=True)
# Varying-intercept/slope parameters: 'alpha%i' and 'beta%i' columns per category.
VSI_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
Temp = pd.DataFrame(Extract_Samples['beta'], columns = ['beta%i' %i for i in range(len(Data['Category'].unique()))])
VSI_Parm = pd.concat([VSI_Parm,Temp], axis =1)
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_07cfdb859f961538dc97402c518711de NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# MCMC diagnostics for the varying-intercept/varying-slope fit.
Vars = fit_vars(VSI_fit)
# Sort for a stable display order, consistent with the VS diagnostics cell.
Vars = sorted(Vars)
ax = az.plot_trace(VSI_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
# plot_trace draws two panels per variable, so each label is repeated twice.
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Posterior grid: two columns, with an extra row when the count is odd.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(VSI_fit, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
# Switch off any unused panel in the last row.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
# Posterior summary of the category intercepts (baseline log price by category).
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
# Aggregate with a list (not a set) so the row/column order is deterministic.
Temp = Temp.agg(['mean', 'std']).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top: posterior mean +/- one posterior std per category, ordered by mean.
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom: full posterior distributions (boxplots) for the first n categories.
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
# Posterior summary of the category slopes (the effect of who pays shipping).
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
# Aggregate with a list (not a set) so the row/column order is deterministic.
Temp = Temp.agg(['mean', 'std']).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top: posterior mean +/- one posterior std per category, ordered by mean.
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 1])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in the Effect of the Paid Shipping Fees by Category', fontsize = 18)
# Bottom: full posterior distributions (boxplots) for the first n categories.
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-2, 2])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
# One fitted line per category, now with both the intercept and the slope
# varying: posterior-mean alpha[j] + posterior-mean beta[j] * x.
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)  # shipping fee paid by buyer (0) or seller (1)
Temp = VSI_Parm.copy()
intercept_means = Temp[df_search(Temp, 'alpha')].mean(axis=0)
slope_means = Temp[df_search(Temp, 'beta')].mean(axis=0)
for a_hat, b_hat in zip(intercept_means, slope_means):
    ax.plot(t, a_hat + b_hat * t, color='DarkBlue', marker='o',
            linestyle='--', linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0] - delta, t[1] + delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
# Re-select the example category and its integer code.
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
# VSI model with an in-model posterior-predictive parameter y_pred for the
# selected category at shipping-fee value x_pred.
# FIX: `beta[pred + 1]` and `x_pred` are both scalars, so the elementwise
# operator `.*` is replaced with plain scalar multiplication `*` (the `.*` in
# the likelihood line is kept — there it multiplies two vectors elementwise).
model_code = """data {int<lower=0> N; int<lower=0> M;
int<lower=1,upper=M> category[N];
int<lower=0,upper=M> pred;
vector[N] x; vector[N] y;
real x_pred;}
parameters {vector[M] alpha; vector[M] beta;
real mu_alpha; real mu_beta;
real<lower=0> sigma; real<lower=0> sigma_alpha; real<lower=0> sigma_beta;
real y_pred;}
model {mu_alpha ~ normal(0, 100); mu_beta ~ normal(0, 100);
alpha ~ normal(mu_alpha, sigma_alpha); beta ~ normal(mu_beta, sigma_beta);
y ~ normal(alpha[category] + beta[category].*x, sigma);
y_pred ~ normal(alpha[pred + 1] + beta[pred + 1]*x_pred, sigma);}
"""
IT = int(1e3)  # iterations per chain
model = StanModel(model_code = model_code)
# `pred` is passed 0-based and shifted inside the Stan program (pred + 1).
VSI_pred = model.sampling(data = {'N': Data.shape[0],
                                  'M': len(Data['Category'].unique()),
                                  'category': Data['Category'].values + 1,
                                  'x': Data['Shipping Fee'].values,
                                  'y': Data['Price_Log'].values,
                                  'pred': Selected_Category,
                                  'x_pred': Data.loc[Data.Category == Selected_Category,'Shipping Fee'].values.mean()
                                  },
                          iter= IT, chains=2)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_0e0980e970985f4a6f6b12bc59a9f027 NOW. WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat. To run all diagnostics call pystan.check_hmc_diagnostics(fit)
# Posterior-predictive diagnostics for the VSI y_pred, then the sampled
# predictions as scatter plots: log scale (left) and dollars (right).
_ = az.plot_trace(VSI_pred, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4))
_ = az.plot_posterior(VSI_pred, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
_vsi_category_label = Category_df.loc[Category_df.Category == Selected_Category,
                                      'Category_Name'].values[0]

def _vsi_pred_panel(panel, draws, face, ylim, ylabel):
    # One panel: the draws as a scatter plus a horizontal line at their mean.
    n_draws = len(draws)
    _ = panel.scatter(x=np.arange(n_draws), y=draws, color=face, edgecolors='Navy')
    _ = panel.hlines(draws.mean(), -1, n_draws + 1, linestyles='-',
                     color='OrangeRed', lw=3, label='mean = %.2f' % draws.mean())
    _ = panel.set_title('Predictions: %s' % _vsi_category_label, fontsize=16)
    _ = panel.grid(True)
    _ = panel.set_xlim([-1, n_draws + 1])
    _ = panel.set_ylim(ylim)
    _ = panel.set_ylabel(ylabel)
    _ = panel.legend(bbox_to_anchor=(1, 1), fontsize=14)

Temp0 = VSI_pred['y_pred']
_vsi_pred_panel(ax[0], Temp0, 'SkyBlue', [-1, 6], 'Estimated Price (Logarithm Scale)')
_vsi_pred_panel(ax[1], np.exp(Temp0), 'MediumSeaGreen', [0, 160], 'Estimated Price')
del Temp0
# Compare against the empirical price distribution of the selected category.
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Price | 28.0 | 17.857143 | 10.582005 | 4.000000 | 8.750000 | 17.500000 | 24.000000 | 40.000000 |
| Price_Log | 28.0 | 2.692309 | 0.674433 | 1.410987 | 2.179172 | 2.867495 | 3.182212 | 3.691376 |